%%javascript
IPython.OutputArea.prototype._should_scroll = function(lines) {
return false;
}
import pandas as pd
import numpy as np
import seaborn as sns
import scipy.stats as stats
import matplotlib.pyplot as plt
%matplotlib inline
import networkx as nx
import plotly
from plotly.offline import download_plotlyjs, init_notebook_mode, iplot
import plotly.graph_objs as go
import plotly.figure_factory as ff
import plotly.express as px
px.set_mapbox_access_token(open("./mapbox_token").read())
from plotly.subplots import make_subplots
from bs4 import BeautifulSoup
from warnings import filterwarnings
filterwarnings('ignore')
from datetime import datetime
import requests
# Algorithms
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import GaussianNB
# Matrix
from sklearn.metrics import precision_score, recall_score
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import precision_recall_curve
# Diagram
from sklearn.tree import export_graphviz
import pydotplus
from sklearn.externals.six import StringIO
from IPython.display import Image
caseData = pd.read_csv('covid/case.csv')
caseDataForMap = caseData.copy()
caseDataForMap = caseDataForMap[caseDataForMap[['latitude', 'longitude']] != '-']
cols = ['latitude', 'longitude']
caseData[cols] = caseData[cols].apply(pd.to_numeric, errors='coerce', axis=1)
display(caseData.head())
fig = px.scatter_mapbox(
caseData[caseData.latitude != '-'],
text="<br>City: " + caseData["city"] +" <br>Province: " + caseData["province"],
lat="latitude",
lon="longitude",
color="confirmed",
size="confirmed",
color_continuous_scale=px.colors.sequential.Burg,
size_max=100,
mapbox_style='dark',
zoom=6,
title="Overview of the cases in Korea")
fig.show()
Majority of the cases is concentrated in Daegu while the rest of the areas have lower amount of cases.
It seems there are 2 huge cluster in Korea regarding Covid-19 cases. One of the cluster is in the province of Daegu.
And another cluster is in the province of seoul
caseData = pd.read_csv('covid/case.csv')
caseDataForMap = caseData.copy()
caseDataSorted = caseDataForMap.sort_values(by=['confirmed'], ascending=False)
display(caseDataSorted.head())
sortedValues = caseDataForMap.groupby(['province','group']).sum().sort_values(by=['confirmed'], ascending=False).reset_index()
sortedValues = sortedValues[['province','confirmed','group']]
display(sortedValues.head())
fig = px.bar(sortedValues, x='confirmed', y='province', color='group', barmode='group')
fig.update_layout(hovermode='y')
fig.show()
In general, there are more group cases than non-group cases in korea.
The only prominent exception is Busan.
This may imply that people are more prone to covid-19 when they are in group
locations = caseData.pivot("province", "infection_case", "confirmed")
display(locations.head())
f, ax = plt.subplots(figsize=(20, 5))
sns.heatmap(locations, cmap="gnuplot", annot=False, fmt="d", linewidths=.5, ax=ax).set_title('Numbers of cases sort by source and province')
# Used log, because daegu has high amount of infection which highly skewed the data
caseData['logConfirmed'] = np.log(caseData['confirmed'])
locations = caseData.pivot("province", "infection_case", "logConfirmed")
display(locations.head())
f, ax = plt.subplots(figsize=(20, 5))
sns.heatmap(locations, cmap="gnuplot", annot=False, fmt="d", linewidths=.5, ax=ax).set_title('Numbers of cases sort by source and province')
fig = px.sunburst(caseDataSorted,
path=['province', 'infection_case'],
values='confirmed',
width=1000, height=600)
fig.show()
The top covid sources for
fig = px.sunburst(caseDataSorted,
path=['infection_case', 'province'],
values='confirmed',
width=1000, height=600)
fig.show()
The top 5 sources for covid-19 comes from
caseData = pd.read_csv('covid/case.csv')
caseDataForMap = caseData.copy()
caseDataForMap['logConfirmed'] = np.log(caseDataForMap['confirmed'])
display(caseDataForMap.head())
fig = px.box(caseDataForMap, x="logConfirmed", y="province")
fig.update_traces(quartilemethod="exclusive") # or "inclusive", or "linear" by default
fig.update_layout(hovermode='y')
fig.show()
caseDataForMap = caseData.copy()
sortedValues = caseDataForMap.groupby(['province']).sum().sort_values(by=['confirmed'], ascending=False).reset_index()
sortedValues['logConfirmed'] = np.log(sortedValues['confirmed'])
sortedValues = sortedValues[['province','confirmed','logConfirmed']]
display(sortedValues.head())
print('Mean: ', sortedValues.mean().values)
print('Standard Derivation: ', sortedValues.std().values)
This amount of COVID-19 cases has a mean of 670 and a standard deviation of 1610.
Hence we sort the provinces into 4 different types of level:
def calculate_risk_level(sortedValues) :
if sortedValues['confirmed'] >= 2000:
return 4
elif sortedValues['confirmed'] >= 1000 and sortedValues['confirmed'] < 2000:
return 3
elif sortedValues['confirmed'] >= 100 and sortedValues['confirmed'] < 900:
return 2
else:
return 1
sortedValues['Risk Level'] = sortedValues.apply(calculate_risk_level,axis=1);
sortedValues = sortedValues[['province','Risk Level']]
display(sortedValues.head())
patientData = pd.read_csv('covid/patientinfo.csv')
display(patientData.head())
groupPatientData = patientData.groupby('confirmed_date').size().reset_index()
groupPatientData.columns = ['confirmed_date', 'count']
fig = px.line(groupPatientData, x="confirmed_date", y="count", title='Numbers Of Covid Cases Per Day')
fig.update_layout(hovermode='x')
fig.show()
The covid cases has a spike from Feb-Mar period before it gradually reduces from Mar-May period.
During June, the covid cases had a minor rebound.
confinedDaysAnalysis = patientData.copy()
confinedDaysAnalysis = confinedDaysAnalysis[confinedDaysAnalysis['age'] != '100s'] #Just a single data which is highly skewed
confinedDaysAnalysis = confinedDaysAnalysis[confinedDaysAnalysis['released_date'].notnull()]
cols = ['released_date', 'confirmed_date']
confinedDaysAnalysis[cols] = confinedDaysAnalysis[cols].apply(pd.to_datetime, errors='coerce', axis=1)
confinedDaysAnalysis['Total Treatment Days'] = (confinedDaysAnalysis['released_date'] - confinedDaysAnalysis['confirmed_date']).dt.days
groupedData = confinedDaysAnalysis.groupby(['sex', 'age'])['Total Treatment Days'].mean().unstack().stack().reset_index()
groupedData.columns = ["sex", "age", "Average Treatment Days"]
dataForHeatmap = groupedData.pivot("sex", "age", "Average Treatment Days")
display(dataForHeatmap.head())
f, ax = plt.subplots(figsize=(15, 5))
sns.heatmap(dataForHeatmap, cmap="RdPu", annot=False, fmt="d", linewidths=.5, ax=ax).set_title('Average Survivor Treatment Day, Sort by Age & Gender')
In general, the older the patient, the longer it will take for them to recover
confinedDaysAnalysis['age'] = confinedDaysAnalysis['age'].dropna()
age = {'0s':0, '10s':1, '20s':2, '30s':3, '40s':4, '50s':5, '60s':6, '70s':7, '80s':8, '90s':9}
confinedDaysAnalysis['age'] = confinedDaysAnalysis['age'].map(age)
confinedDaysAnalysis = confinedDaysAnalysis[confinedDaysAnalysis['age'].notna()]
fig = px.scatter(confinedDaysAnalysis, x="age", y="Total Treatment Days", color="sex", trendline="ols", title="Does age group affect total treatment days?")
fig.update_layout(hovermode='y')
fig.show()
With a R squared value of 0.03, it seems that the age of the patient has little effect on the number of days it take for them to recover from covid-19
groupedData = confinedDaysAnalysis.groupby(['province', 'age'])['Total Treatment Days'].mean().unstack().stack().reset_index()
groupedData.columns = ["province", "age", "Average Treatment Days"]
groupedData['Average Treatment Days'] = np.ceil(groupedData['Average Treatment Days'].apply(pd.to_numeric, errors='coerce'))
dataForHeatmap = groupedData.pivot("province", "age", "Average Treatment Days")
display(dataForHeatmap.head())
f, ax = plt.subplots(figsize=(20, 5))
sns.heatmap(dataForHeatmap, cmap="RdPu", annot=False, fmt="d", linewidths=.5, ax=ax).set_title('Average Survivor Treatment Day, Sort by City & Age')
confinedDaysAnalysis = patientData.copy()
confinedDaysAnalysis = confinedDaysAnalysis[confinedDaysAnalysis['age'] != '100s'] #Just a single data which is highly skewed
confinedDaysAnalysis = confinedDaysAnalysis[confinedDaysAnalysis['deceased_date'].notnull()]
cols = ['deceased_date', 'confirmed_date']
confinedDaysAnalysis[cols] = confinedDaysAnalysis[cols].apply(pd.to_datetime, errors='coerce', axis=1)
confinedDaysAnalysis['Total Treatment Days'] = (confinedDaysAnalysis['deceased_date'] - confinedDaysAnalysis['confirmed_date']).dt.days
display(confinedDaysAnalysis.head())
groupedData = confinedDaysAnalysis.groupby(['sex', 'age'])['Total Treatment Days'].mean().unstack().stack().reset_index()
groupedData.columns = ["sex", "age", "Average Treatment Days"]
dataForHeatmap = groupedData.pivot("sex", "age", "Average Treatment Days")
display(dataForHeatmap.head())
f, ax = plt.subplots(figsize=(15, 5))
sns.heatmap(dataForHeatmap, cmap="RdPu", annot=False, fmt="d", linewidths=.5, ax=ax).set_title('Average Deceased Treatment Day, Sort by Age & Gender')
groupedData = confinedDaysAnalysis.groupby(['province', 'age'])['Total Treatment Days'].mean().unstack().stack().reset_index()
groupedData.columns = ["province", "age", "Average Treatment Days"]
groupedData['Average Treatment Days'] = np.ceil(groupedData['Average Treatment Days'].apply(pd.to_numeric, errors='coerce'))
dataForHeatmap = groupedData.pivot("province", "age", "Average Treatment Days")
display(dataForHeatmap.head())
f, ax = plt.subplots(figsize=(20, 5))
sns.heatmap(dataForHeatmap, cmap="RdPu", annot=False, fmt="d", linewidths=.5, ax=ax).set_title('Average Deceased Treatment Day, Sort by City & Age')
survivorCountAnalysis = patientData.copy()
survivorCountAnalysis['survive'] = survivorCountAnalysis['released_date'].notnull()
survivorCountAnalysis['deceased'] = survivorCountAnalysis['deceased_date'].notnull()
survivorCountAnalysis['under treatment'] = survivorCountAnalysis['deceased_date'].isnull() & survivorCountAnalysis['released_date'].isnull()
provinceStats = survivorCountAnalysis.groupby(['province']).sum()
provinceStatsClean = provinceStats[['survive', 'deceased', 'under treatment']]
provinceStatsClean['survive %'] = np.round(provinceStatsClean['survive'] / (provinceStatsClean['survive'] + provinceStatsClean['deceased'] + provinceStatsClean['under treatment']) * 100, 2)
provinceStatsClean['deceased %'] = np.round(provinceStatsClean['deceased'] / (provinceStatsClean['survive'] + provinceStatsClean['deceased'] + provinceStatsClean['under treatment']) * 100, 2)
provinceStatsClean['under treatment %'] = np.round(provinceStatsClean['under treatment'] / (provinceStatsClean['survive'] + provinceStatsClean['deceased'] + provinceStatsClean['under treatment']) * 100, 2)
provinceStatsAbsolute = provinceStatsClean[['survive', 'deceased', 'under treatment']]
provinceStatsPercentage = provinceStatsClean[['survive %', 'deceased %', 'under treatment %']]
display(provinceStatsAbsolute.head())
display(provinceStatsPercentage.head())
newTable = provinceStatsPercentage.reset_index()
display(newTable.head())
newTable = pd.melt(newTable, id_vars="province", var_name="stats", value_name="rate")
display(newTable.head())
fig = px.bar(newTable, x='rate', y='province', color='stats', barmode='group')
fig.update_layout(hovermode='y')
fig.show()
survivorCountAnalysis = patientData.copy()
survivorCountAnalysis['survive'] = survivorCountAnalysis['released_date'].notnull()
survivorCountAnalysis['deceased'] = survivorCountAnalysis['deceased_date'].notnull()
survivorCountAnalysis['under treatment'] = survivorCountAnalysis['deceased_date'].isnull() & survivorCountAnalysis['released_date'].isnull()
provinceStats = survivorCountAnalysis.groupby(['sex']).sum()
provinceStatsClean = provinceStats[['survive', 'deceased', 'under treatment']]
provinceStatsClean['survive %'] = np.round(provinceStatsClean['survive'] / (provinceStatsClean['survive'] + provinceStatsClean['deceased'] + provinceStatsClean['under treatment']) * 100, 2)
provinceStatsClean['deceased %'] = np.round(provinceStatsClean['deceased'] / (provinceStatsClean['survive'] + provinceStatsClean['deceased'] + provinceStatsClean['under treatment']) * 100, 2)
provinceStatsClean['under treatment %'] = np.round(provinceStatsClean['under treatment'] / (provinceStatsClean['survive'] + provinceStatsClean['deceased'] + provinceStatsClean['under treatment']) * 100, 2)
provinceStatsAbsolute = provinceStatsClean[['survive', 'deceased', 'under treatment']]
provinceStatsPercentage = provinceStatsClean[['survive %', 'deceased %', 'under treatment %']]
display(provinceStatsAbsolute)
display(provinceStatsPercentage)
newTable = provinceStatsPercentage.reset_index()
newTable = pd.melt(newTable, id_vars="sex", var_name="stats", value_name="rate")
display(newTable.head())
fig = px.bar(newTable, x='rate', y='sex', color='stats', barmode='group')
fig.update_layout(hovermode='y')
fig.show()
networkData = patientData.copy()
networkData = networkData[networkData['infected_by'].notnull()]
networkData = networkData[['patient_id','sex','age','province','city','infection_case','infected_by','state']]
display(networkData.head())
A = list(networkData["infected_by"].unique())
B = list(networkData["patient_id"].unique())
node_list = set(A+B)
# Create Graph
G = nx.Graph()
for i in node_list:
G.add_node(i)
# G.nodes()
for i,j in networkData.iterrows():
G.add_edges_from([(j["infected_by"],j["patient_id"])])
pos = nx.spring_layout(G, k=0.5, iterations=50)
for n, p in pos.items():
G.nodes[n]['pos'] = p
edge_trace = go.Scatter(
x=[],
y=[],
line=dict(width=0.5,color='#888'),
hoverinfo='none',
mode='lines')
for edge in G.edges():
x0, y0 = G.nodes[edge[0]]['pos']
x1, y1 = G.nodes[edge[1]]['pos']
edge_trace['x'] += tuple([x0, x1, None])
edge_trace['y'] += tuple([y0, y1, None])
node_trace = go.Scatter(
x=[],
y=[],
text=[],
mode='markers',
hoverinfo='text',
marker=dict(
showscale=True,
colorscale='RdPu_r',
reversescale=True,
color=[],
size=15,
colorbar=dict(
thickness=10,
title='Numnber of Infected Cases',
xanchor='left',
titleside='right'
),
line=dict(width=0)))
for node in G.nodes():
x, y = G.nodes[node]['pos']
node_trace['x'] += tuple([x])
node_trace['y'] += tuple([y])
for node, adjacencies in enumerate(G.adjacency()):
node_trace['marker']['color']+=tuple([len(adjacencies[1])])
node_info = str(adjacencies[0]) +' # of connections: '+ str(len(adjacencies[1]))
node_trace['text']+=tuple([node_info])
fig = go.Figure(data=[edge_trace, node_trace],
layout=go.Layout(
title='<br>Korea Covid Network Connections',
titlefont=dict(size=16),
showlegend=False,
hovermode='closest',
margin=dict(b=20,l=5,r=5,t=40),
annotations=[ dict(
text="",
showarrow=False,
xref="paper", yref="paper") ],
xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
yaxis=dict(showgrid=False, zeroline=False, showticklabels=False)))
iplot(fig)
fig = px.sunburst(networkData, path=['province','infected_by'])
fig.show()
networkData = patientData.copy()
networkData = networkData[networkData['infected_by'].notnull()]
networkData = networkData[['sex','age','province','state','infection_case']]
# print('With NA values')
# display(networkData.count())
# fig = px.parallel_categories(networkData, color_continuous_scale=px.colors.sequential.Inferno)
# fig.show()
# print('Without NA values')
networkData = networkData.dropna()
display(networkData.count())
fig = px.parallel_categories(networkData, color_continuous_scale=px.colors.sequential.Inferno)
fig.show()
searchData = pd.read_csv('covid/searchtrend.csv')
searchDataCopy = searchData.copy()
searchDataMelted = pd.melt(searchDataCopy,
id_vars=['date'],
value_vars=['cold', 'flu','pneumonia','coronavirus'])
searchDataMelted = searchDataMelted[pd.to_datetime(searchDataMelted['date']).dt.year > 2019]
display(searchDataMelted.head())
policyData = pd.read_csv('covid/policy.csv')
policyDataCopy = policyData.copy()
policyDataCopy['end_date'] = policyDataCopy['end_date'].fillna(datetime.now().strftime('%Y-%m-%d'))
display(policyDataCopy.head())
df = []
for index, row in policyDataCopy.iterrows():
df.append(dict(Task=row['gov_policy'], Start=row['start_date'], Finish=row['end_date'], Resource=row['type']))
fig = px.timeline(df, x_start="Start", x_end="Finish", y="Task", color="Resource", title="Policy over time")
fig.update_layout(hovermode='x')
fig.show()
patientData = pd.read_csv('covid/patientinfo.csv')
groupPatientData = patientData.groupby('confirmed_date').size().reset_index()
groupPatientData.columns = ['confirmed_date', 'count']
fig = px.line(searchDataMelted, x="date", y="value", color="variable", title='Search Trend over Time')
fig.update_layout(hovermode='x')
fig.show()
fig = px.line(groupPatientData, x="confirmed_date", y="count", title='Numbers Of Covid Cases Per Day')
fig.update_layout(hovermode='x')
fig.show()
timeData = pd.read_csv('covid/time.csv')
timeDataMelted = pd.melt(timeData, id_vars=['date'], value_vars=['test', 'negative','confirmed','released','deceased'])
display(timeDataMelted.head())
fig = px.line(timeDataMelted, x="date", y="value", color='variable', title="Overall cases over time")
fig.update_layout(hovermode='x')
fig.show()
timeDataMelted = pd.melt(timeData, id_vars=['date'], value_vars=['confirmed','released','deceased'])
display(timeDataMelted.head())
fig = px.line(timeDataMelted, x="date", y="value", color='variable', title="Confirmed, released & deceased over time")
fig.update_layout(hovermode='x')
fig.show()
miniTimeData = timeData[['date','test','negative']]
miniTimeData['Test increase'] = miniTimeData['test'] - miniTimeData['test'].shift(1)
miniTimeData['Negative increase'] = miniTimeData['negative'] - miniTimeData['negative'].shift(1)
miniTimeData['Month'] = pd.to_datetime(miniTimeData['date']).dt.month
miniTimeData['Day'] = pd.to_datetime(miniTimeData['date']).dt.day_name()
display(miniTimeData.head())
miniTimeDataDay = miniTimeData[['Test increase', 'Negative increase', 'Day']]
miniTimeDataDay = miniTimeDataDay.groupby('Day')['Test increase', 'Negative increase'].mean()
cats = [ 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
miniTimeDataDay = miniTimeDataDay.reindex(cats)
miniTimeDataDay = miniTimeDataDay.reset_index()
miniTimeDataDayMelted = pd.melt(miniTimeDataDay, id_vars=['Day'], value_vars=['Test increase','Negative increase'])
display(miniTimeDataDayMelted.head())
fig = px.bar(miniTimeDataDayMelted, x='Day', y='value', color='variable', barmode='group')
fig.update_layout(hovermode='x')
fig.show()
In general, there are more test conducted during the weekdays
and the number of test conducted is lesser during the weekday
The result might take one day to process which causes a lag in the result of the test cases
miniTimeDataMonth = miniTimeData[['Test increase', 'Negative increase', 'Month']]
miniTimeDataMonth = miniTimeDataMonth.groupby('Month')['Test increase', 'Negative increase'].mean()
miniTimeDataMonth = miniTimeDataMonth.reset_index()
miniTimeDataMonthMelted = pd.melt(miniTimeDataMonth, id_vars=['Month'], value_vars=['Test increase','Negative increase'])
display(miniTimeDataMonthMelted.head())
fig = px.bar(miniTimeDataMonthMelted, x='Month', y='value', color='variable', barmode='group')
fig.update_layout(hovermode='x')
fig.show()
There are more test conducted during March and June.
This trend also correspond to the the number of covid-19 cases in Korea
It seems that there is a spiked in test conducted whether there is an increase in covid-19 cases
miniTimeData = timeData[['date','confirmed','released','deceased']]
miniTimeData['confirmed increase'] = miniTimeData['confirmed'] - miniTimeData['confirmed'].shift(1)
miniTimeData['released increase'] = miniTimeData['released'] - miniTimeData['released'].shift(1)
miniTimeData['deceased increase'] = miniTimeData['deceased'] - miniTimeData['deceased'].shift(1)
miniTimeData['Month'] = pd.to_datetime(miniTimeData['date']).dt.month
miniTimeData['Day'] = pd.to_datetime(miniTimeData['date']).dt.day_name()
display(miniTimeData.head())
miniTimeDataDay = miniTimeData[['confirmed increase', 'released increase','deceased increase', 'Day']]
miniTimeDataDay = miniTimeDataDay.groupby('Day')['confirmed increase', 'released increase','deceased increase'].mean()
cats = [ 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
miniTimeDataDay = miniTimeDataDay.reindex(cats)
miniTimeDataDay = miniTimeDataDay.reset_index()
miniTimeDataDayMelted = pd.melt(miniTimeDataDay, id_vars=['Day'], value_vars=['confirmed increase', 'released increase','deceased increase'])
display(miniTimeDataDayMelted.head())
fig = px.bar(miniTimeDataDayMelted, x='Day', y='value', color='variable', barmode='group')
fig.update_layout(hovermode='x')
fig.show()
miniTimeDataMonth = miniTimeData[['confirmed increase', 'released increase','deceased increase', 'Month']]
miniTimeDataMonth = miniTimeDataMonth.groupby('Month')['confirmed increase', 'released increase','deceased increase'].mean()
miniTimeDataMonth = miniTimeDataMonth.reset_index()
miniTimeDataMonthMelted = pd.melt(miniTimeDataMonth, id_vars=['Month'], value_vars=['confirmed increase', 'released increase','deceased increase'])
display(miniTimeDataMonthMelted.head())
fig = px.bar(miniTimeDataMonthMelted, x='Month', y='value', color='variable', barmode='group')
fig.update_layout(hovermode='x')
fig.show()
timeAgeData = pd.read_csv('covid/timeage.csv')
display(timeAgeData.head())
fig = px.line(timeAgeData, x="date", y="confirmed", color='age', title="Confirmed cases of various age group over time")
fig.update_layout(hovermode='x')
fig.show()
timeAgeData = pd.read_csv('covid/timeage.csv')
fig = px.line(timeAgeData, x="date", y="deceased", color='age', title="Deceased cases of various age group over time")
fig.update_layout(hovermode='x')
fig.show()
timeAgeData = pd.read_csv('covid/timeage.csv')
timeAgeData['month'] = pd.to_datetime(timeAgeData['date']).dt.month
display(timeAgeData.head())
fig = px.scatter(timeAgeData,
x='confirmed',
y='deceased',
color="age",
size_max=100,
size="deceased",
animation_frame="date",
animation_group="age",
range_x=[0, timeAgeData['confirmed'].max()],
range_y=[0, timeAgeData['deceased'].max() + timeAgeData['deceased'].std()]
)
fig.layout.updatemenus[0].buttons[0].args[1]["frame"]["duration"] = 200
fig.layout.updatemenus[0].buttons[0].args[1]["transition"]["duration"] = 200
fig.layout.coloraxis.showscale = False
fig.layout.sliders[0].pad.t = 10
fig.layout.updatemenus[0].pad.t= 10
fig.show()
display(timeAgeData.head())
timeAgeData = timeAgeData[~timeAgeData['age'].isin(['0s','10s','20s'])]
timeAgeData['month'] = pd.to_datetime(timeAgeData['date']).dt.month
fig = px.scatter(timeAgeData, x="confirmed", y="deceased", animation_frame="date", animation_group="age",
size="deceased", color="age", hover_name="age", facet_col="age",
range_x=[0, timeAgeData['confirmed'].max()],
range_y=[0, timeAgeData['deceased'].max()])
fig.show()
timeGender = pd.read_csv('covid/timegender.csv')
display(timeGender.head())
fig = px.line(timeGender, x="date", y="confirmed", color='sex', title="Confirmed cases between genders over time")
fig.update_layout(hovermode='x')
fig.show()
fig = px.line(timeGender, x="date", y="deceased", color='sex', title="Deceased cases between genders over time")
fig.update_layout(hovermode='x')
fig.show()
timeGender = pd.read_csv('covid/timegender.csv')
display(timeGender.head())
fig = px.scatter(timeGender,
x='confirmed',
y='deceased',
color="sex",
size_max=100,
size="deceased",
animation_frame="date",
animation_group="sex",
range_x=[0, timeGender['confirmed'].max() + timeGender['confirmed'].std()],
range_y=[0, timeGender['deceased'].max() + timeGender['deceased'].std()]
)
fig.layout.updatemenus[0].buttons[0].args[1]["frame"]["duration"] = 200
fig.layout.updatemenus[0].buttons[0].args[1]["transition"]["duration"] = 200
fig.layout.coloraxis.showscale = False
fig.layout.sliders[0].pad.t = 10
fig.layout.updatemenus[0].pad.t= 10
fig.show()
timeGender = pd.read_csv('covid/timegender.csv')
timeGenderAnalysis = timeGender.copy()
timeGenderAnalysis['confirmed increase'] = timeGenderAnalysis['confirmed'] - timeGenderAnalysis['confirmed'].shift(2)
timeGenderAnalysis['deceased increase'] = timeGenderAnalysis['deceased'] - timeGenderAnalysis['deceased'].shift(2)
timeGenderAnalysis['month'] = pd.to_datetime(timeGenderAnalysis['date']).dt.month
timeGenderAnalysis['day'] = pd.to_datetime(timeGenderAnalysis['date']).dt.day_name()
timeGenderAnalysisMonth = timeGenderAnalysis.groupby(['month','sex'])['confirmed increase'].mean()
timeGenderAnalysisMonth = timeGenderAnalysisMonth.reset_index()
timeGenderAnalysisMonthMelted = pd.melt(timeGenderAnalysisMonth, id_vars=['month','sex'], value_vars=['confirmed increase'])
display(timeGenderAnalysisMonthMelted.head())
fig = px.bar(timeGenderAnalysisMonthMelted, x='month', y='value', color='sex', barmode='group')
fig.update_layout(hovermode='x')
fig.show()
timeGenderAnalysisMonth = timeGenderAnalysis.groupby(['day','sex'])['confirmed increase'].mean()
timeGenderAnalysisMonth = timeGenderAnalysisMonth.reset_index()
weekdays = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]
timeGenderAnalysisMonth['day'] = pd.Categorical(timeGenderAnalysisMonth['day'],categories=weekdays)
timeGenderAnalysisMonth = timeGenderAnalysisMonth.sort_values('day')
timeGenderAnalysisMonthMelted = pd.melt(timeGenderAnalysisMonth, id_vars=['day','sex'], value_vars=['confirmed increase'])
display(timeGenderAnalysisMonthMelted.head())
fig = px.bar(timeGenderAnalysisMonthMelted, x='day', y='value', color='sex', barmode='group')
fig.update_layout(hovermode='x')
fig.show()
timeGenderAnalysisMonth = timeGenderAnalysis.groupby(['month','sex'])['deceased increase'].mean()
timeGenderAnalysisMonth = timeGenderAnalysisMonth.reset_index()
timeGenderAnalysisMonthMelted = pd.melt(timeGenderAnalysisMonth, id_vars=['month','sex'], value_vars=['deceased increase'])
display(timeGenderAnalysisMonthMelted.head())
fig = px.bar(timeGenderAnalysisMonthMelted, x='month', y='value', color='sex', barmode='group')
fig.update_layout(hovermode='x')
fig.show()
timeGenderAnalysisMonth = timeGenderAnalysis.groupby(['day','sex'])['deceased increase'].mean()
timeGenderAnalysisMonth = timeGenderAnalysisMonth.reset_index()
weekdays = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]
timeGenderAnalysisMonth['day'] = pd.Categorical(timeGenderAnalysisMonth['day'],categories=weekdays)
timeGenderAnalysisMonth = timeGenderAnalysisMonth.sort_values('day')
timeGenderAnalysisMonthMelted = pd.melt(timeGenderAnalysisMonth, id_vars=['day','sex'], value_vars=['deceased increase'])
display(timeGenderAnalysisMonthMelted.head())
fig = px.bar(timeGenderAnalysisMonthMelted, x='day', y='value', color='sex', barmode='group')
fig.update_layout(hovermode='x')
fig.show()
timeProvince = pd.read_csv('covid/timeprovince.csv')
display(timeProvince.head())
fig = px.line(timeProvince, x="date", y="confirmed", color='province', title="Confirmed cases of various province over time")
fig.update_layout(hovermode='x')
fig.show()
fig = px.line(timeProvince, x="date", y="released", color='province', title="Released cases of various province over time")
fig.update_layout(hovermode='x')
fig.show()
fig = px.line(timeProvince, x="date", y="deceased", color='province', title="Deceased cases of various province over time")
fig.update_layout(hovermode='x')
fig.show()
timeProvince = pd.read_csv('covid/timeprovince.csv')
display(timeProvince.head())
fig = px.scatter(timeProvince,
y='released',
x='confirmed',
color="province",
size_max=100,
size="deceased",
animation_frame="date",
animation_group="province",
range_y=[0, timeProvince['confirmed'].max() + timeProvince['confirmed'].std()],
range_x=[0, timeProvince['released'].max() + timeProvince['released'].std()]
)
fig.layout.updatemenus[0].buttons[0].args[1]["frame"]["duration"] = 200
fig.layout.updatemenus[0].buttons[0].args[1]["transition"]["duration"] = 200
fig.layout.coloraxis.showscale = False
fig.layout.sliders[0].pad.t = 10
fig.layout.updatemenus[0].pad.t= 10
fig.show()
timeProvince = pd.read_csv('covid/timeprovince.csv')
timeProvince = timeProvince[timeProvince['province'] != 'Daegu']
display(timeProvince.head())
fig = px.scatter(timeProvince,
y='released',
x='confirmed',
color="province",
size_max=100,
size="deceased",
animation_frame="date",
animation_group="province",
range_y=[0, timeProvince['confirmed'].max() + timeProvince['confirmed'].std()],
range_x=[0, timeProvince['released'].max() + timeProvince['released'].std()]
)
fig.layout.updatemenus[0].buttons[0].args[1]["frame"]["duration"] = 200
fig.layout.updatemenus[0].buttons[0].args[1]["transition"]["duration"] = 200
fig.layout.coloraxis.showscale = False
fig.layout.sliders[0].pad.t = 10
fig.layout.updatemenus[0].pad.t= 10
fig.show()
timeProvince = pd.read_csv('covid/timeprovince.csv')
dataForDaegu = timeProvince[timeProvince['province'] == 'Daegu']
dataForDaegu = dataForDaegu[['date','province','confirmed']]
dataForDaegu['increasePerDay'] = dataForDaegu['confirmed'] - dataForDaegu['confirmed'].shift(1)
dataForDaegu = dataForDaegu[['date','province','increasePerDay']]
dataForDaegu = dataForDaegu.rename({'increasePerDay':'Increase Per Day'}, axis=1)
display(dataForDaegu.head())
fig = px.line(dataForDaegu, x="date", y="Increase Per Day", title="Daegu Cases Per Day")
fig.update_layout(hovermode='x')
fig.show()
Daeugu's Cases is most active during Feb to Apr period.
# Uncommend to focus data on active period
# dataForDaegu = dataForDaegu.set_index('date')
# dataForDaegu = dataForDaegu['2020-02-01' : '2020-04-31'].reset_index()
dataForDaegu['10 Days Moving Average'] = dataForDaegu['Increase Per Day'].rolling(10).mean()
dataForDaegu['Upper Limit 10 day Bollinger Band'] = dataForDaegu['Increase Per Day'].rolling(10).mean() + (dataForDaegu['Increase Per Day'].rolling(10).std() * 2)
dataForDaegu['Lower Limit 10 day Bollinger Band'] = dataForDaegu['Increase Per Day'].rolling(10).mean() - (dataForDaegu['Increase Per Day'].rolling(10).std() * 2)
dataForDaeguNew = pd.melt(dataForDaegu, id_vars=['date'], value_vars=['Increase Per Day', '10 Days Moving Average','Upper Limit 10 day Bollinger Band','Lower Limit 10 day Bollinger Band'])
display(dataForDaeguNew.head())
fig = px.line(dataForDaeguNew, x="date", y="value", color='variable', title="Daegu Cases Per Day")
fig['data'][2]['line']['color']="rgba(147,212,219,0.8)"
fig['data'][3]['line']['color']="rgba(157,212,219,0.8)"
fig.update_layout(hovermode='x')
fig.show()
dataForDaeguNew['date'] = pd.to_datetime(dataForDaeguNew['date'])
dataForDaeguNew['Month'] = dataForDaeguNew['date'].dt.month
dataForDaeguNew['Day'] = dataForDaeguNew['date'].dt.day_name()
dataForDaeguMonth = dataForDaeguNew[['variable','Month']]
dataForDaeguMonth = dataForDaeguNew.groupby(['Month','variable'])['value'].mean()
dataForDaeguMonth = dataForDaeguMonth.reset_index()
display(dataForDaeguMonth.head())
fig = px.bar(dataForDaeguMonth, x='Month', y='value', color='variable', barmode='group')
fig.update_layout(hovermode='x')
fig.show()
dataForDaeguMonth = dataForDaeguNew[['variable','Day']]
dataForDaeguMonth = dataForDaeguNew.groupby(['Day','variable'])['value'].mean()
dataForDaeguMonth = dataForDaeguMonth.reset_index()
weekdays = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]
dataForDaeguMonth['Day'] = pd.Categorical(dataForDaeguMonth['Day'],categories=weekdays)
dataForDaeguMonth = dataForDaeguMonth.sort_values('Day')
display(dataForDaeguMonth.head())
fig = px.bar(dataForDaeguMonth, x='Day', y='value', color='variable', barmode='group')
fig.update_layout(hovermode='x')
fig.show()
timeProvince = pd.read_csv('covid/timeprovince.csv')
data = {'province': ['Seoul', 'Busan', 'Daegu', 'Incheon', 'Gwangju', 'Daejeon',
'Ulsan', 'Sejong', 'Gyeonggi-do', 'Gangwon-do',
'Chungcheongbuk-do', 'Chungcheongnam-do', 'Jeollabuk-do',
'Jeollanam-do', 'Gyeongsangbuk-do', 'Gyeongsangnam-do', 'Jeju-do'],
'longitude': [127.047325,129.066666,128.600006,126.705208,126.916664,127.385002,
129.316666,127.2822,127.143738,127.920158,
127.935905,126.979874,126.916664,126.9910,
129.263885,128.429581,126.5312],
'latitude': [37.517235,35.166668,35.866669,37.456257,35.166668,36.351002,
35.549999,36.4870,37.603405,37.342220,
36.981304,36.806702,35.166668,34.8679,
35.835354,34.855228,33.4996]}
location = pd.DataFrame(data=data)
mergedData = timeProvince.merge(location, on='province', how='left')
mergedData = mergedData.dropna()
mergedData.head()
fig = px.scatter_mapbox(
mergedData, lat="latitude", lon="longitude",
size="confirmed", size_max=100,
color="deceased", color_continuous_scale=px.colors.sequential.Burg,
hover_name="province",
mapbox_style='dark', zoom=6,
animation_frame="date", animation_group="province"
)
fig.layout.updatemenus[0].buttons[0].args[1]["frame"]["duration"] = 200
fig.layout.updatemenus[0].buttons[0].args[1]["transition"]["duration"] = 200
fig.layout.coloraxis.showscale = False
fig.layout.sliders[0].pad.t = 10
fig.layout.updatemenus[0].pad.t= 10
fig.show()
fig = px.scatter_mapbox(
mergedData, lat="latitude", lon="longitude",
size="confirmed", size_max=100,
color="deceased", color_continuous_scale=px.colors.sequential.Burg,
hover_name="province",
mapbox_style='dark', zoom=6,
)
fig.show()
weatherData = pd.read_csv('covid/weather.csv')
weatherData = weatherData.set_index('date')
weatherData = weatherData['2020-01-01' : '2020-08-31'].reset_index()
display(weatherData.head())
weatherDataForHeatmap = weatherData.pivot("date", "province", "avg_temp")
f, ax = plt.subplots(figsize=(20, 20))
sns.heatmap(weatherDataForHeatmap, cmap="RdPu", annot=False, fmt="d", linewidths=.5, ax=ax).set_title('Weather Over Time')
high_direction_province = weatherData.copy()
high_direction_province['month'] = high_direction_province['date'].apply(pd.to_datetime).dt.month
high_direction_province.head()
high_direction_province_grouped = high_direction_province.groupby(['province','month']).agg({'avg_temp':['mean','std','min','max'],
'min_temp':['mean','std'],
'max_temp':['mean','std'],
'max_wind_speed':['mean','std'],
'most_wind_direction':['mean','std'],
'avg_relative_humidity':['mean','std']})
print('Agg Data')
display(high_direction_province_grouped.head())
high_direction_province_grouped = high_direction_province_grouped.stack().stack()
high_direction_province_grouped = high_direction_province_grouped.reset_index()
high_direction_province_grouped.columns = ['Province','Month', 'Measurement Type','Weather Type','Value']
high_direction_province_grouped = high_direction_province_grouped.set_index(['Weather Type','Measurement Type','Province','Month'])
print('Indexed Data')
display(high_direction_province_grouped.head())
busan_data = high_direction_province_grouped.loc[(['avg_temp','most_wind_direction'],slice(None),'Busan'), :].sort_values(['Weather Type','Month'])
busan_data.head()
busan_data_mean = high_direction_province_grouped.loc[(['avg_temp','most_wind_direction'],'mean','Busan'), :].sort_values(['Weather Type','Month']).reset_index()
display(busan_data_mean.head())
fig = px.bar(busan_data_mean, x='Month', y='Value', color='Weather Type', barmode='group')
fig.update_layout(hovermode='x')
fig.show()
busan_temp = busan_data_mean[busan_data_mean['Weather Type'] == 'avg_temp']['Value']
busan_temp = pd.DataFrame(busan_temp).reset_index().drop('index',axis=1)
busan_temp.columns = ['avg_temp']
# display(busan_temp)
busan_wind_direction = busan_data_mean[busan_data_mean['Weather Type'] == 'most_wind_direction']['Value']
busan_wind_direction = pd.DataFrame(busan_wind_direction).reset_index().drop('index',axis=1)
busan_wind_direction.columns = ['most_wind_direction']
# display(busan_wind_direction)
plot_table = busan_wind_direction.join(busan_temp)
display(plot_table)
fig = px.scatter(plot_table, x="most_wind_direction", y="avg_temp", trendline="ols", title="Avg Temp & Wind Direction")
fig.update_layout(hovermode='x')
fig.show()
With a R squared of 0.79, there is a strong correlation between the avg-temp and the most wind direction.
The wind direction increases whether there is a decrease in temperature
all_data = all_data.reset_index()
all_data_temp = all_data[all_data['Weather Type'] == 'avg_temp']['Value'] all_data_temp = all_data_temp.reset_index() all_data_temp = all_data_temp.drop('index',axis=1) all_data_temp.columns = ['temp']
all_data_wind = all_data[all_data['Weather Type'] == 'most_wind_direction']['Value'] all_data_wind = all_data_wind.reset_index() all_data_wind = all_data_wind.drop('index',axis=1) all_data_wind.columns = ['wind']
all_data_plot = all_data_temp.join(all_data_wind) display(all_data_plot.head())
fig = px.scatter(all_data_plot, x="wind", y="temp", trendline="ols", title="Avg Temp & Wind Direction") fig.update_layout(hovermode='x') fig.show()
With a R squared of 0.009, it seems that other region does not follow similar trend as Busan.
regionData = pd.read_csv('covid/region.csv')
pd.set_option('display.max_rows', regionData.shape[0]+1)
display(regionData.head())
fig = px.scatter_mapbox(
regionData[regionData.city != 'Korea'],
text="city",
lat="latitude",
lon="longitude",
color="elderly_population_ratio",
size="nursing_home_count",
color_continuous_scale=px.colors.sequential.Burg,
size_max=100,
zoom=6,
title="Number of nusing home and elder population ratio across Korea")
fig.show()
page = requests.get("https://www.worldometers.info/coronavirus")
page.status_code
# page.content
soup = BeautifulSoup(page.content, 'lxml')
# print(soup.prettify())
covidTable = soup.find('table', attrs={'id': 'main_table_countries_today'})
# covidTable
rows = covidTable.find_all("tr", attrs={"style": ""})
covidData = []
for i,data in enumerate(rows):
if i == 0:
covidData.append(data.text.strip().split("\n")[:13])
else:
covidData.append(data.text.strip().split("\n")[:12])
# covidData
covidTable = pd.DataFrame(covidData[1:], columns=covidData[0][:12])
covidTable = covidTable[~covidTable['#'].isin(['World', 'Total:'])]
covidTable = covidTable.drop('#', axis =1)
covidTable.head()
fig = px.bar(covidTable, y='TotalCases', x='Country,Other')
fig.update_layout(hovermode='x')
fig.show()
We will be using log because the data is highly skewed
covidTable['logTotalCases'] = np.log(covidTable['TotalCases'].str.replace(r'\D', '').astype(int))
covidTable = covidTable.sort_values(by=['logTotalCases'])
fig = px.bar(covidTable, y='logTotalCases', x='Country,Other')
fig.update_layout(hovermode='x')
fig.show()
pd.set_option('display.max_rows', covidTable.shape[0]+1)
covidTable['TotalDeaths'].str.strip()
covidTable['TotalDeaths'].replace('', np.nan, inplace=True)
covidTable['TotalDeaths'].replace(' ', np.nan, inplace=True)
covidTable['TotalDeaths'] = covidTable['TotalDeaths'].fillna(0)
covidTable['logTotalDeath'] = np.log(covidTable['TotalDeaths'].str.replace(r'\D', '').astype(float))
covidTable = covidTable.sort_values(by=['logTotalDeath'])
display(covidTable.head())
fig = px.bar(covidTable, y='logTotalDeath', x='Country,Other')
fig.update_layout(hovermode='x')
fig.show()
fig = px.scatter(covidTable, x="logTotalDeath", y="logTotalCases", trendline="ols", title="Total Cases againist Total Death (Log)")
fig.update_layout(hovermode='x')
fig.show()
With a R squared value of 0.88, there is a strong correlation between the cases and the deaths.
The more cases there are, the higher the amount of death
We will create a model which will predict whether a COVID-19 patient will survive given a certain set of conditions
patientData = pd.read_csv('covid/patientinfo.csv')
patientModellingData = patientData[['sex','age','province','confirmed_date','released_date','deceased_date','state']]
## We removed isolated patient since it is not confirmed if they survived or not
patientModellingData = patientModellingData[patientModellingData['state'] != 'isolated']
nullList = ['sex','age','confirmed_date']
for item in nullList:
patientModellingData = patientModellingData[~patientModellingData[item].isnull()]
display(patientModellingData.head())
cols = ['released_date', 'deceased_date', 'confirmed_date']
patientModellingData[cols] = patientModellingData[cols].apply(pd.to_datetime, errors='coerce', axis=1)
def calculate_number_of_treatment_days(row):
if row["released_date"] is not pd.NaT:
treatmentDays = pd.to_numeric((row['released_date'] - row['confirmed_date']).days)
return(treatmentDays)
elif row["deceased_date"] is not pd.NaT:
treatmentDays = pd.to_numeric((row['deceased_date'] - row['confirmed_date']).days)
return(treatmentDays)
else:
return None
patientModellingData['Treatment Days'] = patientModellingData.apply(calculate_number_of_treatment_days, axis=1)
patientModellingData = patientModellingData[~patientModellingData['Treatment Days'].isnull()]
patientModellingDataTreatment = patientModellingData[['sex','age','province','state','Treatment Days']]
display(patientModellingDataTreatment.head())
genders = {"male": 0, "female": 1}
patientModellingDataTreatment['sex'] = patientModellingDataTreatment['sex'].map(genders)
display(patientModellingDataTreatment.head())
state = {"released": 0, "deceased": 1}
patientModellingDataTreatment['state'] = patientModellingDataTreatment['state'].map(state)
display(patientModellingDataTreatment.head())
age = {'0s':0, '10s':1, '20s':2, '30s':3, '40s':4, '50s':5, '60s':6, '70s':7, '80s':8, '90s':9}
patientModellingDataTreatment['age'] = patientModellingDataTreatment['age'].map(age)
display(patientModellingDataTreatment.head())
provinceDummy = pd.get_dummies(patientModellingDataTreatment['province'])
provinceDummy
dataForModelling = patientModellingDataTreatment.join(provinceDummy)
dataForModelling = dataForModelling.drop('province',axis=1)
dataForModelling
plt.figure(figsize=(10,5))
sns.heatmap(dataForModelling.corr(), cmap="RdPu")
dataForModelling['age'] = pd.to_numeric(dataForModelling['age'])
dataForModellingForCleaning = dataForModelling.copy()
display(dataForModellingForCleaning.count())
print(np.any(np.isnan(dataForModellingForCleaning)))
print(np.all(np.isfinite(dataForModellingForCleaning)))
def clean_dataset(df):
assert isinstance(df, pd.DataFrame), "df needs to be a pd.DataFrame"
df.dropna(inplace=True)
indices_to_keep = ~df.isin([np.nan, np.inf, -np.inf]).any(1)
return df[indices_to_keep].astype(np.float64)
clean_dataset(dataForModellingForCleaning)
display(dataForModellingForCleaning.count())
print(np.any(np.isnan(dataForModellingForCleaning)))
print(np.all(np.isfinite(dataForModellingForCleaning)))
x = dataForModellingForCleaning.drop("state", axis=1)
y = dataForModellingForCleaning["state"]
X_train, X_test, Y_train, Y_test = train_test_split(x, y, test_size=0.33, random_state=42)
sgd = linear_model.SGDClassifier(max_iter=5, tol=None)
sgd.fit(X_train, Y_train)
Y_pred = sgd.predict(X_test)
sgd.score(X_train, Y_train)
acc_sgd = round(sgd.score(X_train, Y_train) * 100, 2)
random_forest = RandomForestClassifier(n_estimators=100)
random_forest.fit(X_train, Y_train)
Y_prediction = random_forest.predict(X_test)
random_forest.score(X_train, Y_train)
acc_random_forest = round(random_forest.score(X_train, Y_train) * 100, 2)
logreg = LogisticRegression()
logreg.fit(X_train, Y_train)
Y_pred = logreg.predict(X_test)
acc_log = round(logreg.score(X_train, Y_train) * 100, 2)
gaussian = GaussianNB()
gaussian.fit(X_train, Y_train)
Y_pred = gaussian.predict(X_test)
acc_gaussian = round(gaussian.score(X_train, Y_train) * 100, 2)
knn = KNeighborsClassifier(n_neighbors = 3)
knn.fit(X_train, Y_train)
Y_pred = knn.predict(X_test)
acc_knn = round(knn.score(X_train, Y_train) * 100, 2)
perceptron = Perceptron(max_iter=5)
perceptron.fit(X_train, Y_train)
Y_pred = perceptron.predict(X_test)
acc_perceptron = round(perceptron.score(X_train, Y_train) * 100, 2)
linear_svc = LinearSVC()
linear_svc.fit(X_train, Y_train)
Y_pred = linear_svc.predict(X_test)
acc_linear_svc = round(linear_svc.score(X_train, Y_train) * 100, 2)
decision_tree = DecisionTreeClassifier()
decision_tree.fit(X_train, Y_train)
Y_pred = decision_tree.predict(X_test)
acc_decision_tree = round(decision_tree.score(X_train, Y_train) * 100, 2)
results = pd.DataFrame({
'Model': ['Support Vector Machines', 'KNN', 'Logistic Regression',
'Random Forest', 'Naive Bayes', 'Perceptron',
'Stochastic Gradient Decent',
'Decision Tree'],
'Score': [acc_linear_svc, acc_knn, acc_log,
acc_random_forest, acc_gaussian, acc_perceptron,
acc_sgd, acc_decision_tree]})
result_df = results.sort_values(by='Score', ascending=False)
result_df = result_df.set_index('Score')
result_df.head(9)
feature_cols = x.columns
dot_data = StringIO()
export_graphviz(decision_tree, out_file = dot_data,
feature_names = feature_cols,
filled = True, rounded = True,
special_characters = True)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
graph.write_png('covidTree.png')
Image(graph.create_png())
# Entropy 0 == no disorder // perfect knowledge, perfect classification
importances = pd.DataFrame({'feature':X_train.columns,'importance':np.round(random_forest.feature_importances_,3)})
importances1 = importances.sort_values('importance',ascending=False).set_index('feature')
importances1.head(15)
barData = importances.reset_index()
fig = px.bar(barData, x='feature', y='importance')
fig.update_layout(hovermode='x')
fig.show()
predictions = cross_val_predict(random_forest, X_train, Y_train, cv=3)
result1_train = confusion_matrix(Y_train, predictions)
display(result1_train)
p1_train = precision_score(Y_train, predictions)
r1_train = recall_score(Y_train, predictions)
f1_train = f1_score(Y_train, predictions)
print("Precision:", p1_train)
print("Recall:", r1_train)
print("F-Score:", f1_train)
predictions = cross_val_predict(random_forest, X_test, Y_test, cv=3)
result1_test = confusion_matrix(Y_test, predictions)
display(result1_test)
p1_test = precision_score(Y_test, predictions)
r1_test = recall_score(Y_test, predictions)
f1_test = f1_score(Y_test, predictions)
print("Precision:", p1_test)
print("Recall:", r1_test)
print("F-Score:", f1_test)
Lets observe what happened if we drop the treatment days column Will the model be more accurate?
x2 = dataForModellingForCleaning.drop(["state","Treatment Days"], axis=1)
y2 = dataForModellingForCleaning["state"]
X_train, X_test, Y_train, Y_test = train_test_split(x2, y2, test_size=0.33, random_state=42)
random_forest = RandomForestClassifier(n_estimators=100)
random_forest.fit(X_train, Y_train)
Y_prediction = random_forest.predict(X_test)
random_forest.score(X_train, Y_train)
acc_random_forest = round(random_forest.score(X_train, Y_train) * 100, 2)
decision_tree = DecisionTreeClassifier()
decision_tree.fit(X_train, Y_train)
Y_pred = decision_tree.predict(X_test)
acc_decision_tree = round(decision_tree.score(X_train, Y_train) * 100, 2)
results = pd.DataFrame({
'Model': ['Random Forest','Decision Tree'],
'Score': [acc_random_forest, acc_decision_tree]})
result_df = results.sort_values(by='Score', ascending=False)
result_df = result_df.set_index('Score')
result_df.head(2)
feature_cols = x2.columns
dot_data = StringIO()
export_graphviz(decision_tree, out_file = dot_data,
feature_names = feature_cols,
filled = True, rounded = True,
special_characters = True)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
graph.write_png('covidTree2.png')
Image(graph.create_png())
importances = pd.DataFrame({'feature':X_train.columns,'importance':np.round(random_forest.feature_importances_,3)})
importances2 = importances.sort_values('importance',ascending=False).set_index('feature')
importances2.head(15)
barData = importances.reset_index()
fig = px.bar(barData, x='feature', y='importance')
fig.update_layout(hovermode='x')
fig.show()
predictions = cross_val_predict(random_forest, X_train, Y_train, cv=3)
result2_train = confusion_matrix(Y_train, predictions)
display(result2_train)
p2_train = precision_score(Y_train, predictions)
r2_train = recall_score(Y_train, predictions)
f2_train = f1_score(Y_train, predictions)
print("Precision:", p2_train)
print("Recall:", r2_train)
print("F-Score:", f2_train)
predictions = cross_val_predict(random_forest, X_test, Y_test, cv=3)
result2_test = confusion_matrix(Y_test, predictions)
display(result2_test)
p2_test = precision_score(Y_test, predictions)
r2_test = recall_score(Y_test, predictions)
f2_test = f1_score(Y_test, predictions)
print("Precision:", p2_test)
print("Recall:", r2_test)
print("F-Score:", f2_test)
The old model is better than the new model even though it has a lower precision rate.
The number of survivor in COVID-19 is much higher than the number of deceased.
Hence, it is much harder to predict who will pass away from the infection.
The new model is unable to predict whether a patient will pass away.
The number of false negatives outweigh the number of true negatives.
On the other hand, the old model is able to predict the deceased much accurately than the new model.
With that in mind, this also indicated that the number of days of treatment is vital.
Generally, those who passed away from the infection had lesser days of treatment. However, is it right for us to conclude that lesser amount of treatment days, the higher the fatality rate? Are the risk of dying higher during first few days/weeks of treatment? Or is there something more to it?
This situation is something similar to Abraham Wald and the Missing Bullet Holes. @penguinpress summed up the situation perfectly:
You don't want your planes to get shot down by enemy fighters, so you armor them. But armor makes the plane heavier, and heavier planes are less maneuverable and use more fuel. Armoring the planes too much is a problem; armoring the planes too little is a problem. Somewhere in between there's an optimum.
When American planes came back from engagements over Europe, they were covered in bullet holes. But the damage wasn't uniformly distributed across the aircraft. There were more bullet holes in the fuselage, not so many in the engines.

At first glance, it seems reasonable to focus the armor on the fuselage. However, the armor, said Wald, doesn't go where the bullet holes are. It goes where the bullet holes aren't: on the engines.
The missing bullet holes were on the missing planes. The reason planes were coming back with fewer hits to the engine is that planes that got hit in the engine weren't coming back.
If you go to the recovery room at the hospital, you'll see a lot more people with bullet holes in their legs than people with bullet holes in their chests. But that's not because people don't get shot in the chest; it's because the people who get shot in the chest don't recover.
The link to the excellent article written by @penguinpress is in the credit section below.
In this case, our model only consists of people who have passed away after they have gone through treatment. We have excluded people who have died even before they have the opportunity to go through treatment!Â
Even after we have excluded those who have not undergo treatment before they passed away, our model indicated that the number of treatment days is the strongest factor among gender, location and age. Treatment days has a whooping percentage of 60% in terms of importance next to age which has a importance of 20%.
The chances of survival increases as the number of treatment days increases. Hence, the faster detection rate, the earlier the treatment, the more days and opportunities to treat the infection, the better the chances of survival. The Korea government put a very strong emphasis on COVID-19 detection programme and hence they managed to contain the COVID-19 situation.
Various cities among the world were also in lockdown to prevent the rapid spread of COVID-19. The healthcare system will be overwhelmed if COVID-19 were to spread widely which will cause the death rate to escalate as more people will have lesser opportunity to undergo treatment.
The older the person, the higher the chances of infection and death. Hence it is not advisable for elderly people to roam around unless it is necessary.
In terms of locations, the majority of the cases came from Daegu and the sources of infection is from places such as church and clubs. The higher the amount of contact, the greater the chances of infection. It is advisable for the authority to close such areas until a vaccine for COVID-19 is found. During May to June, there is an increase in COVID-19 cases due to complacency.
https://www.aa.com.tr/en/asia-pacific/s-korea-sees-mass-covid-19-cases-linked-to-night-clubs/1838031
https://www.channelnewsasia.com/news/asia/south-korea-covid-19-church-backlash-13092284
Will the model's accuracy improve if we include patient who died without any treatment?
patientData = pd.read_csv('covid/patientinfo.csv')
patientModellingData = patientData[['sex','age','province','confirmed_date','released_date','deceased_date','state']]
deadPatientModellingData = patientData[patientData['state'] == 'deceased']
totalDiedPatient = deadPatientModellingData.count()
display(totalDiedPatient)
display('***********')
pd.set_option('display.max_rows', deadPatientModellingData.shape[0]+1)
totalDiedPaitentWithoutTreatment = deadPatientModellingData[deadPatientModellingData['deceased_date'].isnull() == True].count()
display(totalDiedPaitentWithoutTreatment)
Assuming patient who died without any released_date or deceased_date are those died without any treatment
There are about 12 patient who died without any treatment
str(12/78 * 100) + '%'
This accounts for 15% of the total death
We will simply add another line of logic to return treatment days = 0
if the patient is deceased and both released and deceased date are empty
In addition, we will add in the month when the patient is confirmed to have covid cases
patientData = pd.read_csv('covid/patientinfo.csv')
patientModellingData = patientData[['sex','age','province','confirmed_date','released_date','deceased_date','state']]
## We removed isolated patient since it is not confirmed if they survived or not
patientModellingData = patientModellingData[patientModellingData['state'] != 'isolated']
nullList = ['sex','age','confirmed_date']
for item in nullList:
patientModellingData = patientModellingData[~patientModellingData[item].isnull()]
display(patientModellingData.head())
cols = ['released_date', 'deceased_date', 'confirmed_date']
patientModellingData[cols] = patientModellingData[cols].apply(pd.to_datetime, errors='coerce', axis=1)
def calculate_number_of_treatment_days(row):
if row["released_date"] is not pd.NaT:
treatmentDays = pd.to_numeric((row['released_date'] - row['confirmed_date']).days)
return(treatmentDays)
elif row["deceased_date"] is not pd.NaT:
treatmentDays = pd.to_numeric((row['deceased_date'] - row['confirmed_date']).days)
return(treatmentDays)
elif row["deceased_date"] is pd.NaT and row["released_date"] is pd.NaT:
if row["state"] == 'deceased' and row["state"] != 'released':
return 0
else:
return None
else:
return None
patientModellingData['Treatment Days'] = patientModellingData.apply(calculate_number_of_treatment_days, axis=1)
patientModellingData = patientModellingData[~patientModellingData['Treatment Days'].isnull()]
patientModellingDataTreatment = patientModellingData[['sex','age','confirmed_date','province','state','Treatment Days']]
patientModellingDataTreatment['confirmed_month'] = patientModellingDataTreatment['confirmed_date'].dt.month
patientModellingDataTreatment = patientModellingDataTreatment.drop('confirmed_date', axis=1)
display(patientModellingDataTreatment.head())
genders = {"male": 0, "female": 1}
patientModellingDataTreatment['sex'] = patientModellingDataTreatment['sex'].map(genders)
state = {"released": 0, "deceased": 1}
patientModellingDataTreatment['state'] = patientModellingDataTreatment['state'].map(state)
age = {'0s':0, '10s':1, '20s':2, '30s':3, '40s':4, '50s':5, '60s':6, '70s':7, '80s':8, '90s':9}
patientModellingDataTreatment['age'] = patientModellingDataTreatment['age'].map(age)
display(patientModellingDataTreatment.head())
dataForModelling = patientModellingDataTreatment.join(provinceDummy)
dataForModelling = dataForModelling.drop('province',axis=1)
dataForModelling
clean_dataset(dataForModelling)
display(dataForModelling.count())
print(np.any(np.isnan(dataForModelling)))
print(np.all(np.isfinite(dataForModelling)))
x3 = dataForModelling.drop(["state"], axis=1)
y3 = dataForModelling["state"]
X_train, X_test, Y_train, Y_test = train_test_split(x3, y3, test_size=0.33, random_state=42)
## Random_forest
random_forest = RandomForestClassifier(n_estimators=100)
random_forest.fit(X_train, Y_train)
Y_prediction = random_forest.predict(X_test)
random_forest.score(X_train, Y_train)
acc_random_forest = round(random_forest.score(X_train, Y_train) * 100, 2)
## Decision Tree
decision_tree = DecisionTreeClassifier()
decision_tree.fit(X_train, Y_train)
Y_pred = decision_tree.predict(X_test)
acc_decision_tree = round(decision_tree.score(X_train, Y_train) * 100, 2)
results = pd.DataFrame({
'Model': ['Random Forest','Decision Tree'],
'Score': [acc_random_forest, acc_decision_tree]})
result_df = results.sort_values(by='Score', ascending=False)
result_df = result_df.set_index('Score')
result_df.head(2)
importances = pd.DataFrame({'feature':X_train.columns,'importance':np.round(random_forest.feature_importances_,3)})
importances3 = importances.sort_values('importance',ascending=False).set_index('feature')
importances3.head(15)
barData = importances.reset_index()
fig = px.bar(barData, x='feature', y='importance')
fig.update_layout(hovermode='x')
fig.show()
predictions = cross_val_predict(random_forest, X_train, Y_train, cv=3)
result3_train = confusion_matrix(Y_train, predictions)
display(result3_train)
p3_train = precision_score(Y_train, predictions)
r3_train = recall_score(Y_train, predictions)
f3_train = f1_score(Y_train, predictions)
print("Precision:", p3_train)
print("Recall:", r3_train)
print("F-Score:",f3_train)
predictions = cross_val_predict(random_forest, X_test, Y_test, cv=3)
result3_test = confusion_matrix(Y_test, predictions)
display(result3_test)
p3_test = precision_score(Y_test, predictions)
r3_test = recall_score(Y_test, predictions)
f3_test = f1_score(Y_test, predictions)
print("Precision:", p3_test)
print("Recall:", r3_test)
print("F-Score:", f3_test)
Precision/recall tradeoff
# getting the probabilities of our predictions
y_scores = random_forest.predict_proba(X_train)
y_scores = y_scores[:,1]
precision, recall, threshold = precision_recall_curve(Y_train, y_scores)
def plot_precision_and_recall(precision, recall, threshold):
plt.plot(threshold, precision[:-1], "r-", label="precision", linewidth=5)
plt.plot(threshold, recall[:-1], "b", label="recall", linewidth=5)
plt.xlabel("threshold", fontsize=10)
plt.legend(loc="upper right", fontsize=10)
plt.ylim([0, 1])
plt.figure(figsize=(10, 5))
plot_precision_and_recall(precision, recall, threshold)
plt.show()
print('******************************')
print('Model without treatment day: ')
print('importance:', importances2.head(3))
display(result2_train)
display(result2_test)
print('Model with treatment day :')
print('importance:', importances1.head(3))
display(result1_train)
display(result1_test)
print('******************************')
print('Model with treatment day, those who died without treatment and infected month:')
print('importance:', importances3.head(3))
display(result3_train)
display(result3_test)
print('******************************')
print('Model without treatment day: ')
print("Train Data Precision:", p2_train)
print("Train Data Recall:", r2_train)
print("Train Data F-Score:", f2_train)
print("Test Data Precision:", p2_test)
print("Test Data Recall:", r2_test)
print("Test Data F-Score:", f2_test)
print('******************************')
print('Model with treatment day :')
print("Train Data Precision:", p1_train)
print("Train Data Recall:", r1_train)
print("Train Data F-Score:", f1_train)
print("Test Data Precision:", p1_test)
print("Test Data Recall:", r1_test)
print("Test Data F-Score:", f1_test)
print('******************************')
print('Model with treatment day, those who died without treatment and infected month:')
print("Train Data Precision:", p3_train)
print("Train Data Recall:", r3_train)
print("Train Data F-Score:", f3_train)
print("Test Data Precision:", p3_test)
print("Test Data Recall:", r3_test)
print("Test Data F-Score:", f3_test)
The model accuracy improved after including the month which the patient is infected and those who died without treatment
Soh, Z. (2020). 02a.bank-marketing.ipynb [Scholarly project]. Retrieved 2020.
Datartist. (2020, July 13). Data Science for COVID-19 (DS4C). Retrieved September 16, 2020, from https://www.kaggle.com/kimjihoo/coronavirusdataset
Press, P. (2016, July 14). Abraham Wald and the Missing Bullet Holes. Retrieved September 16, 2020, from https://medium.com/@penguinpress/an-excerpt-from-how-not-to-be-wrong-by-jordan-ellenberg-664e708cfc3d
Anand. (2019, March 11). Network Graph with AT&T data using Plotly. Retrieved September 16, 2020, from https://medium.com/@anand0427/network-graph-with-at-t-data-using-plotly-a319f9898a02
Donges, N. (2018, May 15). Predicting the Survival of Titanic Passengers. Retrieved September 16, 2020, from https://towardsdatascience.com/predicting-the-survival-of-titanic-passengers-30870ccc7e8
Rostami, D. (2020, March 22). Coronavirus Time Series Map Animation. Retrieved September 16, 2020, from https://datacrayon.com/posts/statistics/data-is-beautiful/coronavirus-time-series-map-animation/
case_id - The ID of the infection caseprovince - Special City / Metropolitan City / Province(-do)city - City(-si) / Country (-gun) / District (-gu)group - TRUE: group infection / FALSE: not groupinfection_case - the infection case (the name of group or other cases)confirmed - the accumulated number of the confirmedlatitude - the latitude of the group (WGS84)longitude - the longitude of the group (WGS84)patient_id - the ID of the patientsex - the sex of the patientage - the age of the patientcountry - the country of the patientprovince - the province of the patientcity - the city of the patientinfection_case - the case of infectioninfected_by - the ID of who infected the patientcontact_number - the number of contacts with peoplesymptom_onset_date - the date of symptom onsetconfirmed_date - the date which the patient confirmed to have the viusreleased_date - the date which the patient is releaseddeceased_date - the date which the patient passed awaystate - the state of the patient: isolated, deceased, releasedpolicy_id - the ID of the policycountry - the country that implemented the policytype - the type of the policygov_policy - the policy of the governmentdetail - the detail of the policystart_date - the start date of the policyend_date - the end date of the policycode - the code of the regionprovince - Special City / Metropolitan City / Province(-do)city - City(-si) / Country (-gun) / District (-gu)latitude - the latitude of the visit (WGS84)longitude - the longitude of the visit (WGS84)elementary_school_count - the number of elementary schoolskindergarten_count - the number of kindergartensuniversity_count - the number of universitiesacademy_ratio - the ratio of academieselderly_population_ratio - the ratio of the elderly populationelderly_alone_ratio - the ratio of the elderly who are alonenursing_home_count - the number of nursing homedate - YYYY-MM-DDcold - the search volume of 'cold' in Korean languageflu - the search volume of 'flu' in Korean languagepneumonia - the search volume of 'pneumonia' in Korean languagedate - YYYY-MM-DDtime - Time (0 = AM 12:00 / 16 = PM 04:00)test - the accumulated number of testsnegative - the accumulated number of negative resultsconfirmed - the accumulated number of positive resultsreleased - the accumulated number of releasesdeceased - the accumulated number of deceasesdate - YYYY-MM-DDtime - Timeage - the age of patientsconfirmed - the accumulated number of the confirmeddeceased - the accumulated number of the deceaseddate - YYYY-MM-DDtime - Timesex - the gender of patientsconfirmed - the accumulated number of the confirmeddeceased - the accumulated number of the deceaseddate - YYYY-MM-DDtime - Timeprovince - the province of South Koreaconfirmed - the accumulated number of the confirmed in the provincereleased - the accumulated number of the released in the provincedeceased - the accumulated number of the deceased in the provincecode - the code of the regionprovince - Special City / Metropolitan City / Province(-do)date - YYYY-MM-DDavg_temp - the average temperaturemin_temp - the lowest temperaturemax_temp - the highest temperatureprecipitation - the daily precipitationmax_wind_speed - the maximum wind speedmost_wind_direction - the most frequent wind directionavg_relative_humidity - the average relative humidity